COVID-19 Analysis & Visualization

In [5]:
# Plotly (interactive charts) and pandas (data loading and manipulation).
# NOTE(review): `go` is not referenced by any cell visible in this notebook.
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
import pandas as pd

# Matplotlib, used by the word-cloud cells at the end of the notebook
import matplotlib.pyplot as plt

# Plotly offline mode (needed for the py.iplot call on the table figure)
import plotly.offline as py
py.init_notebook_mode(connected=True)

# Select the default Plotly renderer. This assignment runs after
# init_notebook_mode, so 'notebook' is the value in effect for later cells.
pio.renderers.default = 'notebook'
In [6]:
# Load the first dataset from covid.csv (path is relative to the working dir)
dataset1 = pd.read_csv("covid.csv")

# Preview the first five rows
dataset1.head(n=5)
Out[6]:
Country/Region Continent Population TotalCases NewCases TotalDeaths NewDeaths TotalRecovered NewRecovered ActiveCases Serious,Critical Tot Cases/1M pop Deaths/1M pop TotalTests Tests/1M pop WHO Region iso_alpha
0 USA North America 3.311981e+08 5032179 NaN 162804.0 NaN 2576668.0 NaN 2292707.0 18296.0 15194.0 492.0 63139605.0 190640.0 Americas USA
1 Brazil South America 2.127107e+08 2917562 NaN 98644.0 NaN 2047660.0 NaN 771258.0 8318.0 13716.0 464.0 13206188.0 62085.0 Americas BRA
2 India Asia 1.381345e+09 2025409 NaN 41638.0 NaN 1377384.0 NaN 606387.0 8944.0 1466.0 30.0 22149351.0 16035.0 South-EastAsia IND
3 Russia Europe 1.459409e+08 871894 NaN 14606.0 NaN 676357.0 NaN 180931.0 2300.0 5974.0 100.0 29716907.0 203623.0 Europe RUS
4 South Africa Africa 5.938157e+07 538184 NaN 9604.0 NaN 387316.0 NaN 141264.0 539.0 9063.0 162.0 3149807.0 53044.0 Africa ZAF
In [7]:
# Print the (rows, columns) tuple and then the total number of cells
# (rows * columns), each on its own line.
print(dataset1.shape, dataset1.size, sep="\n")
(209, 17)
3553
In [8]:
# Concise summary of dataset1: index range, per-column non-null counts,
# dtypes and memory usage. Useful here for spotting sparsely populated columns.
dataset1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
 16  iso_alpha         209 non-null    object 
dtypes: float64(12), int64(1), object(4)
memory usage: 27.9+ KB
In [9]:
# Load the second dataset from covid_grouped.csv (path is relative to the
# working dir)
dataset2 = pd.read_csv("covid_grouped.csv")

# Preview the first five rows
dataset2.head(n=5)
Out[9]:
Date Country/Region Confirmed Deaths Recovered Active New cases New deaths New recovered WHO Region iso_alpha
0 2020-01-22 Afghanistan 0 0 0 0 0 0 0 Eastern Mediterranean AFG
1 2020-01-22 Albania 0 0 0 0 0 0 0 Europe ALB
2 2020-01-22 Algeria 0 0 0 0 0 0 0 Africa DZA
3 2020-01-22 Andorra 0 0 0 0 0 0 0 Europe AND
4 2020-01-22 Angola 0 0 0 0 0 0 0 Africa AGO
In [10]:
# Print the (rows, columns) tuple and then the total number of cells
# (rows * columns), each on its own line.
print(dataset2.shape, dataset2.size, sep="\n")
(35156, 11)
386716
In [11]:
# Concise summary of dataset2: index range, per-column non-null counts,
# dtypes and memory usage.
# NOTE(review): "Date" is reported as dtype object (strings), not datetime.
dataset2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            35156 non-null  object
 1   Country/Region  35156 non-null  object
 2   Confirmed       35156 non-null  int64 
 3   Deaths          35156 non-null  int64 
 4   Recovered       35156 non-null  int64 
 5   Active          35156 non-null  int64 
 6   New cases       35156 non-null  int64 
 7   New deaths      35156 non-null  int64 
 8   New recovered   35156 non-null  int64 
 9   WHO Region      35156 non-null  object
 10  iso_alpha       35156 non-null  object
dtypes: int64(7), object(4)
memory usage: 3.0+ MB
In [12]:
# Column labels of dataset1 (checked before deciding which columns to drop)
dataset1.columns
Out[12]:
Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region', 'iso_alpha'],
      dtype='object')
In [13]:
# Drop the NewCases, NewDeaths and NewRecovered *columns* from dataset1.
# The info() summary above shows they hold only 3-4 non-null values across
# 209 rows, so they carry almost no information.
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes the cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
dataset1 = dataset1.drop(
    columns=['NewCases', 'NewDeaths', 'NewRecovered'],
    errors='ignore',
)

# Show a random selection of five rows (unseeded, so the rows differ per run)
dataset1.sample(5)
Out[13]:
Country/Region Continent Population TotalCases TotalDeaths TotalRecovered ActiveCases Serious,Critical Tot Cases/1M pop Deaths/1M pop TotalTests Tests/1M pop WHO Region iso_alpha
125 Rwanda Africa 12981546.0 2111 5.0 1258.0 848.0 NaN 163.0 0.4 286251.0 22051.0 Africa RWA
86 Tajikistan Asia 9557468.0 7665 62.0 6443.0 1160.0 NaN 802.0 6.0 NaN NaN Europe TJK
76 Bulgaria Europe 6942854.0 13014 435.0 7374.0 5205.0 47.0 1874.0 63.0 294087.0 42358.0 Europe BGR
199 New Caledonia Australia/Oceania 285769.0 22 NaN 22.0 0.0 NaN 77.0 NaN 11099.0 38839.0 NaN NCL
100 Equatorial Guinea Africa 1407001.0 4821 83.0 2182.0 2556.0 NaN 3426.0 59.0 44356.0 31525.0 Africa GNQ
In [14]:
# Render the first 15 rows of dataset1 as a Plotly table figure
from plotly.figure_factory import create_table

# Colour stops passed to create_table: dark purple (0), pale lilac (0.5),
# white (1)
colorscale = [
    [0, '#4d004c'],
    [.5, '#f2e5ff'],
    [1, '#ffffff'],
]
table = create_table(dataset1.head(15), colorscale=colorscale)
py.iplot(table)
In [15]:
# Bar chart of TotalCases for the first 15 rows of dataset1; bars are
# coloured by the same TotalCases value.
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalCases',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [16]:
# Bar chart of TotalCases for the first 15 rows of dataset1; bar colour
# encodes TotalDeaths.
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalDeaths',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [17]:
# Bar chart of TotalCases for the first 15 rows of dataset1; bar colour
# encodes TotalDeaths.
# NOTE(review): this cell is identical to the one directly before it. It was
# presumably meant to colour by a different column (e.g. TotalRecovered) —
# confirm the intent or remove the duplicate.
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalDeaths',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [18]:
# Bar chart of TotalCases for the first 15 rows of dataset1; bar colour
# encodes TotalTests.
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalTests',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [19]:
# Horizontal bar chart of TotalTests per Country/Region for the first 15
# rows of dataset1; bar length and colour both encode TotalTests.
px.bar(
    dataset1.head(15),
    x='TotalTests',
    y='Country/Region',
    color='TotalTests',
    orientation='h',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [20]:
# Horizontal bar chart of TotalTests with Continent on the y axis, built from
# the first 15 rows of dataset1. Rows that share a continent share one y
# category; the hover data identifies the individual country.
px.bar(
    dataset1.head(15),
    x='TotalTests',
    y='Continent',
    color='TotalTests',
    orientation='h',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [21]:
# Bubble chart over every row of dataset1: Continent on x, TotalCases on y,
# with marker colour and size both driven by TotalCases.
px.scatter(
    dataset1,
    x='Continent',
    y='TotalCases',
    hover_data=['Country/Region', 'Continent'],
    color='TotalCases',
    size='TotalCases',
    size_max=80,
)
In [22]:
# Bubble chart of TotalTests per Continent for the first 54 rows of dataset1;
# marker colour and size both encode TotalTests.
# NOTE(review): the row limit 54 is an unexplained magic number. TotalTests
# has missing values (191 of 209 non-null), which may be why only the leading
# rows are plotted — confirm.
px.scatter(
    dataset1.head(54),
    x='Continent',
    y='TotalTests',
    hover_data=['Country/Region', 'Continent'],
    color='TotalTests',
    size='TotalTests',
    size_max=80,
)
In [23]:
# Same TotalTests-per-Continent bubble chart, drawn on a logarithmic y axis
# and limited to the first 50 rows of dataset1.
# NOTE(review): the row limit (50) differs from the linear-axis version of
# this chart (54) without explanation — confirm which is intended.
px.scatter(
    dataset1.head(50),
    x='Continent',
    y='TotalTests',
    hover_data=['Country/Region', 'Continent'],
    color='TotalTests',
    size='TotalTests',
    size_max=80,
    log_y=True,
)
In [24]:
# Bubble chart of TotalCases per Country/Region for the first 100 rows of
# dataset1; marker colour and size both encode TotalCases.
px.scatter(
    dataset1.head(100),
    x='Country/Region',
    y='TotalCases',
    hover_data=['Country/Region', 'Continent'],
    color='TotalCases',
    size='TotalCases',
    size_max=80,
)
In [25]:
# Bubble chart of TotalCases per Country/Region for the first 30 rows of
# dataset1 on a logarithmic y axis; each country gets its own colour and the
# marker size encodes TotalCases.
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='TotalCases',
    hover_data=['Country/Region', 'Continent'],
    color='Country/Region',
    size='TotalCases',
    size_max=80,
    log_y=True,
)
In [26]:
# Bubble chart of TotalDeaths per Country/Region for the first 10 rows of
# dataset1; each country gets its own colour and the marker size encodes
# TotalDeaths.
px.scatter(
    dataset1.head(10),
    x='Country/Region',
    y='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
    color='Country/Region',
    size='TotalDeaths',
    size_max=80,
)
In [27]:
# Bubble chart of 'Tests/1M pop' per Country/Region for the first 30 rows of
# dataset1; each country gets its own colour and the marker size encodes the
# tests-per-million value.
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='Tests/1M pop',
    hover_data=['Country/Region', 'Continent'],
    color='Country/Region',
    size='Tests/1M pop',
    size_max=80,
)
In [28]:
# Bubble chart of 'Tests/1M pop' per Country/Region for the first 30 rows of
# dataset1; here both marker colour and size encode the tests-per-million
# value.
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='Tests/1M pop',
    hover_data=['Country/Region', 'Continent'],
    color='Tests/1M pop',
    size='Tests/1M pop',
    size_max=80,
)
In [29]:
# TotalCases (x) against TotalDeaths (y) for the first 30 rows of dataset1;
# marker colour and size both encode TotalDeaths.
px.scatter(
    dataset1.head(30),
    x='TotalCases',
    y='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
    color='TotalDeaths',
    size='TotalDeaths',
    size_max=80,
)
In [30]:
# TotalCases (x) against TotalDeaths (y) for the first 30 rows of dataset1,
# with both axes logarithmic; marker colour and size encode TotalDeaths.
px.scatter(
    dataset1.head(30),
    x='TotalCases',
    y='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
    color='TotalDeaths',
    size='TotalDeaths',
    size_max=80,
    log_x=True,
    log_y=True,
)
In [31]:
# TotalTests (x) against TotalCases (y) for the first 30 rows of dataset1,
# with both axes logarithmic; marker colour and size encode TotalTests.
px.scatter(
    dataset1.head(30),
    x='TotalTests',
    y='TotalCases',
    hover_data=['Country/Region', 'Continent'],
    color='TotalTests',
    size='TotalTests',
    size_max=80,
    log_x=True,
    log_y=True,
)
In [32]:
# Bar chart of Confirmed per Date over every row of dataset2 (all
# countries); bar colour encodes Confirmed.
px.bar(
    dataset2,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    hover_data=["Confirmed", "Date", "Country/Region"],
    height=400,
)
In [33]:
# Bar chart of Confirmed per Date over every row of dataset2 (all
# countries), drawn on a logarithmic y axis.
px.bar(
    dataset2,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    hover_data=["Confirmed", "Date", "Country/Region"],
    log_y=True,
    height=400,
)
In [34]:
# Bar chart of Deaths per Date over every row of dataset2 (all countries);
# bar colour encodes Deaths and the y axis is explicitly kept linear.
px.bar(
    dataset2,
    x="Date",
    y="Deaths",
    color="Deaths",
    hover_data=["Confirmed", "Date", "Country/Region"],
    log_y=False,
    height=400,
)
In [35]:
# Keep only the rows of dataset2 whose Country/Region is exactly "US"
df_US = dataset2.loc[dataset2["Country/Region"].eq("US")]
In [36]:
# US only: bar chart of Confirmed per Date, bars coloured by Confirmed
px.bar(
    df_US,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    height=400,
)
In [37]:
# US only: bar chart of Recovered per Date, bars coloured by Recovered
px.bar(
    df_US,
    x="Date",
    y="Recovered",
    color="Recovered",
    height=400,
)
In [38]:
# US only: line chart of Recovered over Date
px.line(
    df_US,
    x="Date",
    y="Recovered",
    height=400,
)
In [39]:
# US only: line chart of Deaths over Date
px.line(
    df_US,
    x="Date",
    y="Deaths",
    height=400,
)
In [40]:
# US only: line chart of Confirmed over Date
px.line(
    df_US,
    x="Date",
    y="Confirmed",
    height=400,
)
In [41]:
# US only: line chart of the "New cases" column over Date
px.line(
    df_US,
    x="Date",
    y="New cases",
    height=400,
)
In [42]:
# US only: bar chart of the "New cases" column per Date
px.bar(
    df_US,
    x="Date",
    y="New cases",
    height=400,
)
In [43]:
# US only: Confirmed (x) against Deaths (y), one point per row of df_US
px.scatter(
    df_US,
    x="Confirmed",
    y="Deaths",
    height=400,
)
In [44]:
# Animated world map of Confirmed: countries are matched through the
# iso_alpha column and there is one animation frame per distinct Date value.
px.choropleth(
    dataset2,
    locations="iso_alpha",
    color="Confirmed",
    hover_name="Country/Region",
    color_continuous_scale="Blues",
    animation_frame="Date",
)
In [45]:
# Animated world map of Deaths: countries are matched through the iso_alpha
# column and there is one animation frame per distinct Date value.
px.choropleth(
    dataset2,
    locations='iso_alpha',
    color="Deaths",
    hover_name="Country/Region",
    color_continuous_scale="Viridis",
    animation_frame="Date",
)
In [46]:
# Animated world map of Recovered on a "natural earth" projection: countries
# are matched through the iso_alpha column and there is one animation frame
# per distinct Date value.
px.choropleth(
    dataset2,
    locations='iso_alpha',
    color="Recovered",
    hover_name="Country/Region",
    color_continuous_scale="RdYlGn",
    projection="natural earth",
    animation_frame="Date",
)
In [47]:
# Animated bar chart of Confirmed per WHO Region, one frame per distinct Date
# value; bars are coloured by region and hovering shows the country name.
px.bar(
    dataset2,
    x="WHO Region",
    y="Confirmed",
    color="WHO Region",
    animation_frame="Date",
    hover_name="Country/Region",
)
In [48]:
# Load the third dataset from coviddeath.csv (path is relative to the
# working dir) and preview its first five rows
dataset3 = pd.read_csv("coviddeath.csv")
dataset3.head(n=5)
Out[48]:
Data as of Start Week End Week State Condition Group Condition ICD10_codes Age Group Number of COVID-19 Deaths Flag
0 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 0-24 122.0 NaN
1 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 25-34 596.0 NaN
2 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 35-44 1521.0 NaN
3 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 45-54 4186.0 NaN
4 08/30/2020 02/01/2020 08/29/2020 US Respiratory diseases Influenza and pneumonia J09-J18 55-64 10014.0 NaN
In [49]:
# Last rows of dataset3, complementing the head() preview of the same frame
dataset3.tail()
Out[49]:
Data as of Start Week End Week State Condition Group Condition ICD10_codes Age Group Number of COVID-19 Deaths Flag
12255 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 65-74 5024.0 NaN
12256 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 75-84 5381.0 NaN
12257 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 85+ 4841.0 NaN
12258 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 Not stated NaN Counts less than 10 suppressed.
12259 08/30/2020 02/01/2020 08/29/2020 YC Coronavirus Disease 2019 COVID-19 U071 All ages 20628.0 NaN
In [50]:
# Per-Condition count of non-null values in every other column of dataset3
dataset3.groupby("Condition").count()
Out[50]:
Data as of Start Week End Week State Condition Group ICD10_codes Age Group Number of COVID-19 Deaths Flag
Condition
Adult respiratory distress syndrome 540 540 540 540 540 540 540 272 268
All other conditions and causes (residual) 540 540 540 540 540 540 540 363 177
Alzheimer disease 530 530 530 530 530 530 530 144 386
COVID-19 540 540 540 540 540 540 540 377 163
Cardiac arrest 520 520 520 520 520 520 520 219 301
Cardiac arrhythmia 540 540 540 540 540 540 540 192 348
Cerebrovascular diseases 530 530 530 530 530 530 530 187 343
Chronic lower respiratory diseases 540 540 540 540 540 540 540 229 311
Diabetes 540 540 540 540 540 540 540 276 264
Heart failure 540 540 540 540 540 540 540 204 336
Hypertensive diseases 540 540 540 540 540 540 540 264 276
Influenza and pneumonia 540 540 540 540 540 540 540 331 209
Intentional and unintentional injury, poisoning, and other adverse events 520 520 520 520 520 520 520 188 332
Ischemic heart disease 540 540 540 540 540 540 540 224 316
Malignant neoplasms 540 540 540 540 540 540 540 198 342
Obesity 530 530 530 530 530 530 530 182 348
Other diseases of the circulatory system 530 530 530 530 530 530 530 213 317
Other diseases of the respiratory system 540 540 540 540 540 540 540 188 352
Renal failure 540 540 540 540 540 540 540 238 302
Respiratory arrest 480 480 480 480 480 480 480 111 369
Respiratory failure 540 540 540 540 540 540 540 320 220
Sepsis 530 530 530 530 530 530 530 243 287
Vascular and unspecified dementia 530 530 530 530 530 530 530 191 339
In [51]:
# Word cloud built from the text of the "Condition" column of dataset3
from wordcloud import WordCloud

# Collect the condition labels and join them into one space-separated string.
# Missing values are skipped and every entry is coerced to str, so str.join
# cannot raise TypeError on a NaN or other non-string cell.
sentences = dataset3["Condition"].dropna().astype(str).tolist()
sentences_as_a_string = ' '.join(sentences)


# Generate the word cloud from the joined string and draw it on a large figure
plt.figure(figsize=(20, 20))
plt.imshow(WordCloud().generate(sentences_as_a_string))
Out[51]:
<matplotlib.image.AxesImage at 0x21e4073dd30>
No description has been provided for this image
In [52]:
# Word cloud built from the text of the "Condition Group" column of dataset3.
# Missing values are skipped and every entry is coerced to str, so str.join
# cannot raise TypeError on a NaN or other non-string cell.
column2_tolist = dataset3["Condition Group"].dropna().astype(str).tolist()

# Convert the list to one single space-separated string
column_to_string = " ".join(column2_tolist)

# Generate the word cloud from the joined string and draw it on a large figure
plt.figure(figsize=(20, 20))
plt.imshow(WordCloud().generate(column_to_string))
Out[52]:
<matplotlib.image.AxesImage at 0x21e41270410>
No description has been provided for this image
In [ ]: